library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(vsn) ; library(WGCNA) ; 
library(GEOquery) ; library(lumi) ; library(limma)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)

Raw data



Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.

Load and annotate data

# NCBI biotype annotation: maps Ensembl IDs to gene biotypes and symbols.
# Harmonise NCBI's 'protein-coding' spelling with BioMart's 'protein_coding'
# so the labels can be compared/coalesced later.
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))

###############################################################################################################
# DOWNLOAD AND CLEAN GENE EXPRESSION DATA

# Gene expression data downloaded directly from GEO because the one returned with get GEO was already preprocessed
# lumiR() reads the raw (non-normalised) Illumina expression file
datExpr = lumiR('./../Data/GSE28521_non-normalized_data.txt.gz') %>% as.matrix
## Perform Quality Control assessment of the LumiBatch object ...
# data.frame + mutate_all() would lose the probe-ID rownames, so save them
# before the conversion and restore them afterwards
rownames_datExpr = rownames(datExpr) 
datExpr = datExpr %>% data.frame %>% mutate_all(function(x) x %>% as.numeric)
rownames(datExpr) = rownames_datExpr

# Download Metadata
# getGEO() returns the series; [[1]] extracts the (single) ExpressionSet
GEO_data = getGEO('GSE28521', destdir='./../Data/')[[1]]
#datExpr = exprs(GEO_data) %>% data.frame   # Already filtered and normalised data  
#datGenes = fData(GEO_data)                 # Only includes information for the filtered genes
# Build sample metadata: GEO accession, brain region, diagnosis (CTL set as
# the reference factor level) and subject ID (first 9 chars of the title)
datMeta = pData(GEO_data) %>% mutate('ID' = geo_accession, 'Brain_lobe' = `tissue (brain region):ch1`, 
                                     'Diagnosis' = factor(ifelse(`disease status:ch1`=='autism', 'ASD','CTL'), 
                                                          levels = c('CTL','ASD')),
                                     'Subject_ID' = substring(title,1,9)) %>%
          dplyr::select(ID, title, Subject_ID, description, Diagnosis, Brain_lobe)
# Prefix with 'X' so rownames line up with the syntactic column names of datExpr
rownames(datMeta) = paste0('X',datMeta$description)


# LABEL GENES WITH ENSEMBL IDS (THEY COME WITH ILLUMINA IDS)
# Get Biomart information
# The feb2014 archive is used because it finds all the Illumina probe IDs
# (more recent versions don't -- see notes further down the script)
ensembl = useMart('ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org') 
getinfo = c('illumina_humanref_8_v3', 'ensembl_gene_id','external_gene_id', 'entrezgene', 'hgnc_symbol',
            'chromosome_name', 'start_position', 'end_position')
datGenes = getBM(attributes = getinfo, filters='illumina_humanref_8_v3', values = rownames(datExpr), 
                 mart = ensembl)
# Attach the (more current) NCBI biotype; for probes matching several genes,
# prefer protein-coding entries, keep one row per probe, and record gene length
datGenes = datGenes %>% left_join(NCBI_biotype %>% dplyr::select(-hgnc_symbol), by = 'ensembl_gene_id') %>% 
           add_count(illumina_humanref_8_v3) %>% filter(n == 1 | gene_biotype == 'protein_coding') %>%
           distinct(illumina_humanref_8_v3, .keep_all = TRUE) %>% 
           mutate(length = end_position - start_position)

# Match DatExpr and datGenes rows, and datExpr columns and datMeta rows
# Keep only annotated probes, then align row/column order across the three
# objects (alignment from here on is positional)
datExpr = datExpr[rownames(datExpr) %in% datGenes$illumina_humanref_8_v3,]
datGenes = datGenes[match(rownames(datExpr), datGenes$illumina_humanref_8_v3),]
datMeta = datMeta[match(colnames(datExpr),rownames(datMeta)),]


###############################################################################################################


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
# Result: one row per matching Ensembl ID with a Neuronal = 1 flag
# (genes without a match are simply absent from this table)
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Keep one entry per Ensembl ID and drop entries with a missing ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# NCBI biotype annotation
# NOTE(review): this re-reads the same file already loaded at the top of the
# script into the same variable -- redundant but harmless
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))


# Replicate ggplot2's default categorical palette: n evenly spaced hues
# around the HCL colour wheel at luminance 65 and chroma 100.
#
# Args:
#   n: number of colours to generate.
# Returns: a character vector of n hex colour codes.
gg_colour_hue = function(n) {
  # n+1 points so the first and last hue (15 == 375 degrees) don't collide;
  # 'length.out' spelled in full (the original relied on partial matching)
  hues = seq(15, 375, length.out = n + 1)
  # Return the palette directly -- the original left the result inside an
  # assignment, so the function returned it only invisibly
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rm(GEO_data, GO_annotations, ensembl, getinfo, mart, rownames_datExpr)

Check sample composition



Illumina microarray expression data for 79 brain-tissue samples across the frontal cortex, temporal cortex and cerebellum, comprising 40 samples from control subjects and 39 samples from ASD subjects (see the cross-tabulations below for the exact counts)


The dataset includes 23527 genes from 79 samples belonging to 33 different subjects.

Counts distribution: Heavy right tail

# Long-format expression values for the summary statistics below
counts = datExpr %>% melt %>% mutate(value = value %>% as.numeric)

# Six-number summary (min, quartiles, mean, max) of all expression values
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
                         'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
                                      mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
                                      max(counts$value)))

count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
Statistic Values
Min 78.38
1st Quartile 245.22
Median 375.93
Mean 1487.09
3rd Quartile 1051.63
Max 54121.42
rm(counts, count_distr)


Diagnosis distribution by Sample: Balanced

# Label metadata columns for the expss cross-tabulations below
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe')
cro(table_info$Diagnosis)
 #Total 
 Diagnosis 
   CTL  40
   ASD  39
   #Total cases  79


Diagnosis distribution by Subject: Balanced

cro(table_info$Diagnosis[!duplicated(table_info$Subject_ID)])
 #Total 
 Diagnosis 
   CTL  17
   ASD  16
   #Total cases  33

Brain region distribution: The Frontal lobe has more samples, but they are quite balanced

cro(table_info$Brain_lobe)
 #Total 
 Brain Lobe 
   Cerebellum  21
   Frontal cortex  32
   Temporal cortex  26
   #Total cases  79


Balanced

cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
 Brain Lobe     #Total 
 Cerebellum   Frontal cortex   Temporal cortex   
 Diagnosis 
   CTL  11 16 13   40
   ASD  10 16 13   39
   #Total cases  21 32 26   79


Note: No age or gender information in the metadata :/



Annotate genes with BioMart information



I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:

  1. Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)

  2. Annotate genes with Biotype labels:

    2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)

    2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)

    2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys

    2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys


Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes

# Track how many genes get their biotype label from each annotation source
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
                                      'n_matches' = rep(0,4))

########################################################################################
# 1. Query archive version

# We already did this part when loading the data

# Missing start_position indicates the probe found no match in feb2014 BioMart
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 0/23527 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels

cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations

# Already did this as well

# Report genes whose biotype is still missing after the NCBI join
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 414/23527 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))

########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
               host = 'jan2020.archive.ensembl.org')
# Query only genes that are still missing a biotype label
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart, 
                         values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', 
           sum(is.na(datGenes$gene_biotype)),
           ' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 217/414 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
# coalesce() keeps any existing (NCBI) label and fills gaps with the new one
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
           mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]

########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key

missing_genes = unique(datGenes$external_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
# Reuses the jan2020 'mart' connection from the previous step
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
                                 values = missing_genes)

cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
           length(missing_genes),
           ' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 40/169 genes weren't found in the current BioMart version when querying by gene name
# Symbols that returned more than one biotype are ambiguous, so skip them
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0('    ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
##     2 genes returned multiple labels (these won't be added)
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by=c('external_gene_id'='hgnc_symbol')) %>% 
           mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y) %>%
           mutate(hgnc_symbol = ifelse(is.na(hgnc_symbol), external_gene_id, hgnc_symbol))

labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes

missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), 
                                 values = missing_ensembl_ids, mart=mart)

cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
             ' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 0/42 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
# Last fallback: fill remaining gaps with the feb2014 (outdated) biotypes
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# Plot results

# Add the still-unlabelled genes as 'missing' and convert counts to percentages
labels_source = labels_source %>% add_row(source = 'missing', 
                                          n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>% 
                mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
                       source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
                                                        'NCBI','missing')))
                

# Horizontal stacked bar: share of genes annotated by each source (gray = missing)
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
    theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
    axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
    scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))

# Interactive plot and legend rendered separately
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))

########################################################################################
# Reorder rows to match datExpr (alignment is positional from here on)
datGenes = datGenes[match(rownames(datExpr), datGenes$illumina_humanref_8_v3),]


# Clean up intermediates from the biotype-annotation steps.
# NOTE(review): removed 'datGenes_BM' from this call -- no object of that name
# is ever created in this script, so rm() emitted an "object not found" warning.
rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
   dups, missing_ensembl_ids, missing_genes, labels_source, p)

Filtering



Checking how many SFARI genes are in the dataset

# Count SFARI genes present in the dataset (unique gene symbols after joining
# on Ensembl ID)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length

Considering all genes, this dataset contains 860 of the 912 SFARI genes

1.- Filter out entries for which we couldn’t obtain biotype information

1.1 Missing Biotype

# Keep only genes for which a biotype label was found
to_keep = !is.na(datGenes$gene_biotype)

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

Removed 0 ‘genes’, 23527 remaining

Filtering genes without biotype information, we are left with 860 SFARI Genes (we lose 0 genes)




2.- Filter genes that do not encode any protein


98% of the genes are protein coding genes

# Frequency table of gene biotypes, most common first
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
                          kable_styling(full_width = F)
Biotypes of genes in dataset
. Freq
protein_coding 23049
1 157
3 119
processed_pseudogene 66
lncRNA 60
transcribed_unprocessed_pseudogene 28
unprocessed_pseudogene 20
transcribed_processed_pseudogene 14
lincRNA 4
transcribed_unitary_pseudogene 4
pseudogene 3
TEC 2
polymorphic_pseudogene 1

Non-protein coding genes in general have lower levels of expression than protein coding genes, but the difference is not that big

# Compare mean expression of protein-coding vs non-protein-coding genes
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = rowMeans(datExpr),
                       'ProteinCoding' = datGenes$gene_biotype=='protein_coding')

ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) + 
         geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
# Recompute the SFARI overlap after the biotype filtering above
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))

Filtering protein coding genes, we are left with 860 SFARI Genes (we lose 0 genes)

# Sanity check: datGenes comes from getBM with default integer rownames, so
# this warning fires; row ORDER (not rownames) is what keeps the objects aligned
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
## !!! gene rownames do not match!!!
to_keep = datGenes$gene_biotype == 'protein_coding'
# Use base-R subsetting instead of dplyr::filter(): filter() discards the
# rownames of datExpr (the Illumina probe IDs), which are used further down
# as gene identifiers (e.g. rownames(datExpr) in the threshold/plot chunks)
datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]


Removed 478 genes. 23049 remaining


3.- Filter genes with low expression levels


Choosing the threshold:

Criteria for selecting filtering threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)

# Try several mean-expression filtering thresholds and record, for each one,
# the per-gene mean/SD relationship after the full preprocessing pipeline
# (filter genes -> drop outlier samples -> vsn normalisation)
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
thresholds = c(100, 200, 220, 250, 270, 300)

for(threshold in thresholds){
  
  # Restart from the unfiltered data on every iteration
  datMeta = datMeta_original
  datExpr = datExpr_original
  datGenes = datGenes_original
  
  # Keep genes whose mean expression reaches the threshold
  to_keep = apply(datExpr, 1, function(x) mean(x) >= threshold)
  datGenes = datGenes[to_keep,]
  datExpr = datExpr[to_keep,]
  
  # Filter outlier samples
  # (standardised network connectivity more than 2 SDs below the mean)
  absadj = datExpr %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))
  
  to_keep = z.ku > -2
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  
  # Normalise data using variance stabilisation normalisation
  LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
  pData(LumiBatch) = datMeta
  LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)
  datExpr = exprs(LumiBatch)
  
  rm(absadj, netsummary, ku, z.ku, to_keep, LumiBatch)
  
  
  # Save summary results in dataframe (first iteration creates it,
  # later iterations append)
  if(threshold == thresholds[1]){
    mean_vs_sd_data = data.frame('threshold' = threshold, 'ID' = rownames(datExpr),
                                 'Mean' = rowMeans(datExpr), 'SD' = apply(datExpr,1,sd))
  } else {
    new_entries = data.frame('threshold' = threshold, 'ID' = rownames(datExpr),
                                 'Mean' = rowMeans(datExpr), 'SD' = apply(datExpr,1,sd))
    mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
  }
}


# Visualise the effects of different thresholds
# Keep every low-expression gene but only a 10% random sample of the many
# genes with Mean >= 7, so the interactive plot stays responsive
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
            as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]

# Animated mean-vs-SD scatter, one frame per threshold
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) + 
         geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) + 
         scale_x_log10() + scale_y_log10() + theme_minimal())
# Plot remaining genes
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally

ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
         theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))
# NOTE(review): removed 'dds' from this rm() call -- no DESeq2 object is ever
# created in this script, so rm(dds) emitted an "object 'dds' not found" warning
rm(to_keep_1, to_keep_2, plot_data, thresholds)
# Return to original variables
# (restore the unfiltered data before applying the chosen threshold for real)
datExpr = datExpr_original
datGenes = datGenes_original
datMeta = datMeta_original


rm(datExpr_original, datGenes_original, datMeta_original)


Selecting a threshold of 220

# Mean-expression threshold chosen from the homoscedasticity analysis above.
# NOTE(review): the original comment here said "minimum percentage of non-zero
# entries allowed per gene", but the filter below is on mean expression
threshold = 220

plot_data = data.frame('id'=rownames(datExpr),
                       'threshold' = apply(datExpr, 1, function(x) mean(x)))

# Density of per-gene mean expression with the chosen cut-off marked
ggplotly(plot_data %>% ggplot(aes(x=threshold)) + 
         geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) + 
         geom_vline(xintercept=threshold, color='gray') + 
         xlab('Mean level of expression') + ylab('Density') + scale_x_log10() +
         ggtitle('Mean level of expression by Gene') + theme_minimal())
# Keep genes whose mean expression reaches the threshold
to_keep = apply(datExpr, 1, function(x) mean(x) >= threshold)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

Removed 2652 genes. 20397 remaining

825 SFARI genes remaining (we lost 35 genes)

# Recount SFARI genes surviving the expression filter
n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in%datGenes$ensembl_gene_id] %>% unique %>% length

rm(threshold, plot_data, to_keep)


4.- Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out samples farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

# Standardised network connectivity (z.ku) per sample, from the absolute
# biweight midcorrelation between samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$ID, 
                       'Subject_ID'=datMeta$Subject_ID, 'Brain_Lobe'=datMeta$Brain_lobe, 
                       'Diagnosis'=datMeta$Diagnosis)

selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])

Outlier samples: GSM706410, GSM706394, GSM706425, GSM706391, GSM706455

# Drop samples more than 2 SDs below the mean connectivity
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

rm(absadj, netsummary, ku, z.ku, plot_data)

Removed 5 samples, 74 remaining

rm(to_keep)





5.- Filter repeated genes

There are 4939 genes with more than one ensembl ID in the dataset. To accurately refer to the rows of my data as ‘genes’, I’m going to remove the repeated ones.

# Keep a single row per gene symbol (first occurrence wins)
dup_genes = datGenes$hgnc_symbol %>% duplicated

datGenes = datGenes[!dup_genes,]
datExpr = datExpr[!dup_genes,]

Removed 4939 genes. 15458 remaining

825 SFARI genes remaining (we lost 0 genes)

rm(dup_genes, n_SFARI)



After filtering, the dataset consists of 15458 genes and 74 samples

Save filtered and annotated dataset

# Persist the filtered and annotated dataset for downstream analyses
save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/filtered_raw_data.RData')

Batch Effects Exploratory Analysis



There are no batch surrogate variables in this dataset

Cerebellum samples seem to cluster together. There doesn’t seem to be a strong pattern related to Diagnosis

# Hierarchical clustering of samples (Euclidean distance on expression)
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram

# Colour bars under the dendrogram: diagnosis and brain region per sample,
# ordered to match the dendrogram leaves
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'),  # Blue control, Green ASD
                   'Region' = case_when(Brain_lobe=='Frontal cortex'~'#F8766D', # ggplot defaults for 3 colours
                                        Brain_lobe=='Temporal cortex'~'#00BA38',
                                        TRUE~'#619CFF')) %>%
            dplyr::select(Region, Diagnosis)
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>% 
             dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)

rm(h_clusts, dend_meta)



Looking for unknown sources of batch effects


Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2:

Create a lumi object

# Normalise data using variance stabilisation normalisation (input for SVA)
LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
pData(LumiBatch) = datMeta
LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)
datExpr_norm = exprs(LumiBatch)

Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.

# Fit SVA: the full model includes Diagnosis, the null model is intercept-only.
# NOTE(review): the original comment here said "Perform vst", which does not
# describe this chunk
mod = model.matrix(~ Diagnosis, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = sva(datExpr_norm, mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0)

Found 7 surrogate variables, since there is no direct way to select which ones to pick Bioconductor answer, kept all of them.

Include SV estimations to datMeta information

# Append the estimated surrogate variables (SV1..SVk) to the sample metadata
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))

datMeta_sva = cbind(datMeta, sv_data)

rm(sv_data, sva_fit)




Normalisation and Differential Expression Analysis



Using the lumi package to perform normalisation

# Mean-SD relationship of the raw expression values (before normalisation)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)

Using vsn to normalise the data

# Normalise data using variance stabilisation normalisation
LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
pData(LumiBatch) = datMeta
LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)

datExpr_vst = exprs(LumiBatch)

# DEA
# Differential expression with limma: linear model with Diagnosis, empirical
# Bayes moderation; coef=2 is the ASD-vs-CTL contrast, sort.by='none' keeps
# the gene order of datExpr_vst
fit = lmFit(datExpr_vst, design=model.matrix(~Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_info = topTable(efit,coef=2, number=Inf, sort.by='none')

rm(LumiBatch, fit)

Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic

meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()

Plotting points individually we can notice some heteroscedasticity in the data in the genes with the lowest levels of expression

# Per-gene mean vs SD of the normalised data, with a smoothed trend line
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)



Rename normalised datasets to continue working with these

# Continue the analysis with the vsn-normalised expression matrix and the
# metadata augmented with the SVA surrogate variables
datExpr = datExpr_vst
datMeta = datMeta_sva %>% data.frame
#datGenes = datGenes_vst

# NOTE(review): removed 'datMeta_vst' from this call -- the object never exists
# (rm() emitted an "object 'datMeta_vst' not found" warning, visible in the
# rendered output)
rm(datExpr_vst, datMeta_sva)
## Warning in rm(datExpr_vst, datMeta_vst, datMeta_sva): object 'datMeta_vst' not
## found




Batch Effect Correction



SVA surrogate variables


In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.

Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:

  • Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)

  • But caution should be exercised to avoid removing biological signal of interest

  • We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective

  • Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed

Comparing data with and without surrogate variable correction

# Regress the surrogate variables out of the expression matrix, leaving the
# effects captured by the model matrix `mod` untouched.
# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
  design = cbind(mod, svs)
  # Least-squares coefficients of every covariate, for every gene at once
  coefs = solve(t(design) %*% design) %*% t(design) %*% t(datExpr)
  keep = seq_len(ncol(mod))
  # Subtract only the fitted contribution of the surrogate-variable columns
  datExpr - t(as.matrix(design[, -keep]) %*% coefs[-keep, , drop = FALSE])
}

# PCA of samples (columns) and genes (rows) before the correction
pca_samples_before = prcomp(t(datExpr))
pca_genes_before = prcomp(datExpr)

# Correct: remove the surrogate-variable signal, keeping Diagnosis in the model
mod = model.matrix(~ Diagnosis, datMeta)
svs = as.matrix(dplyr::select(datMeta, contains('SV')))
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

# Same two PCAs on the corrected matrix, for before/after comparison
pca_samples_after = prcomp(t(datExpr_corrected))
pca_genes_after = prcomp(datExpr_corrected)


rm(correctDatExpr)

Samples


The data is divided into two very different groups, but the SVA manages to join them together and make Diagnosis the primary characteristic that separates them

# Long table with the first two sample-level PCs before ('corrected'=0) and
# after ('corrected'=1) SV correction, joined to the sample metadata by ID
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')

# Interactive sample scatter coloured by Diagnosis; the plotly 'frame'
# aesthetic animates between uncorrected and corrected coordinates. Axis
# labels report the before/after correlation of each PC and the change in
# % variance explained
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())


The group of samples that behaves very differently from the rest at the beginning is the one from the Cerebellum. After performing SVA, it is no longer distinguishable

# Same animated sample scatter as above, but coloured by Brain_lobe to show
# how the brain-region separation disappears after the SV correction
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)

Genes


It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)

*Plot is done with only 10% of the genes so it’s not that heavy

# Gene-level PCs before/after correction. Note MeanExpr is the row mean of
# datExpr (still the pre-correction matrix at this point) for both states,
# so both animation frames are coloured by the same mean expression
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

# Random 10% subsample of the genes to keep the interactive plot light
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

# Animated gene scatter coloured by mean expression; axis labels report the
# before/after correlation of each PC and the change in % variance explained
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + 
         geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)

Everything looks good, so we’re keeping the corrected expression dataset

# From here on, work with the SV-corrected expression matrix
datExpr = datExpr_corrected
rm(datExpr_corrected)



Save preprocessed dataset

save(datExpr, datMeta, datGenes, DE_info, efit, file='./../Data/preprocessed_data.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0            knitr_1.28                 
##  [3] expss_0.10.2                dendextend_1.13.4          
##  [5] limma_3.40.6                lumi_2.36.0                
##  [7] GEOquery_2.52.0             WGCNA_1.69                 
##  [9] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
## [11] vsn_3.52.0                  sva_3.32.1                 
## [13] genefilter_1.66.0           mgcv_1.8-31                
## [15] nlme_3.1-147                DESeq2_1.24.0              
## [17] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
## [19] BiocParallel_1.18.1         matrixStats_0.56.0         
## [21] Biobase_2.44.0              GenomicRanges_1.36.1       
## [23] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [25] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [27] biomaRt_2.40.5              ggpubr_0.2.5               
## [29] magrittr_1.5                ggExtra_0.9                
## [31] GGally_1.5.0                gridExtra_2.3              
## [33] viridis_0.5.1               viridisLite_0.3.0          
## [35] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [37] plotly_4.9.2                glue_1.4.1                 
## [39] reshape2_1.4.4              forcats_0.5.0              
## [41] stringr_1.4.0               dplyr_1.0.0                
## [43] purrr_0.3.4                 readr_1.3.1                
## [45] tidyr_1.1.0                 tibble_3.0.1               
## [47] ggplot2_3.3.2               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0         RSQLite_2.2.0            AnnotationDbi_1.46.1    
##   [4] htmlwidgets_1.5.1        grid_3.6.3               munsell_0.5.0           
##   [7] codetools_0.2-16         preprocessCore_1.46.0    nleqslv_3.3.2           
##  [10] miniUI_0.1.1.1           withr_2.2.0              colorspace_1.4-1        
##  [13] highr_0.8                rstudioapi_0.11          ggsignif_0.6.0          
##  [16] labeling_0.3             GenomeInfoDbData_1.2.1   farver_2.0.3            
##  [19] bit64_0.9-7              rhdf5_2.28.1             vctrs_0.3.1             
##  [22] generics_0.0.2           xfun_0.12                R6_2.4.1                
##  [25] doParallel_1.0.15        illuminaio_0.26.0        locfit_1.5-9.4          
##  [28] bitops_1.0-6             reshape_0.8.8            assertthat_0.2.1        
##  [31] promises_1.1.0           scales_1.1.1             nnet_7.3-14             
##  [34] gtable_0.3.0             affy_1.62.0              methylumi_2.30.0        
##  [37] rlang_0.4.6              splines_3.6.3            rtracklayer_1.44.4      
##  [40] lazyeval_0.2.2           acepack_1.4.1            impute_1.58.0           
##  [43] hexbin_1.28.1            broom_0.5.5              checkmate_2.0.0         
##  [46] BiocManager_1.30.10      yaml_2.2.1               modelr_0.1.6            
##  [49] crosstalk_1.1.0.1        GenomicFeatures_1.36.4   backports_1.1.8         
##  [52] httpuv_1.5.2             Hmisc_4.4-0              tools_3.6.3             
##  [55] nor1mix_1.3-0            affyio_1.54.0            ellipsis_0.3.1          
##  [58] siggenes_1.58.0          Rcpp_1.0.4.6             plyr_1.8.6              
##  [61] base64enc_0.1-3          progress_1.2.2           zlibbioc_1.30.0         
##  [64] RCurl_1.98-1.2           prettyunits_1.1.1        rpart_4.1-15            
##  [67] openssl_1.4.2            cowplot_1.0.0            bumphunter_1.26.0       
##  [70] haven_2.2.0              cluster_2.1.0            fs_1.4.0                
##  [73] data.table_1.12.8        reprex_0.3.0             hms_0.5.3               
##  [76] mime_0.9                 evaluate_0.14            xtable_1.8-4            
##  [79] XML_3.99-0.3             jpeg_0.1-8.1             mclust_5.4.6            
##  [82] readxl_1.3.1             compiler_3.6.3           minfi_1.30.0            
##  [85] KernSmooth_2.23-17       crayon_1.3.4             htmltools_0.4.0         
##  [88] later_1.0.0              Formula_1.2-3            geneplotter_1.62.0      
##  [91] lubridate_1.7.4          DBI_1.1.0                dbplyr_1.4.2            
##  [94] MASS_7.3-51.6            Matrix_1.2-18            cli_2.0.2               
##  [97] quadprog_1.5-8           pkgconfig_2.0.3          GenomicAlignments_1.20.1
## [100] foreign_0.8-76           xml2_1.2.5               foreach_1.5.0           
## [103] annotate_1.62.0          rngtools_1.5             webshot_0.5.2           
## [106] multtest_2.40.0          beanplot_1.2             XVector_0.24.0          
## [109] rvest_0.3.5              doRNG_1.8.2              scrime_1.3.5            
## [112] digest_0.6.25            Biostrings_2.52.0        rmarkdown_2.1           
## [115] base64_2.0               cellranger_1.1.0         htmlTable_1.13.3        
## [118] DelayedMatrixStats_1.6.1 curl_4.3                 Rsamtools_2.0.3         
## [121] shiny_1.4.0.2            lifecycle_0.2.0          jsonlite_1.7.0          
## [124] Rhdf5lib_1.6.3           askpass_1.1              fansi_0.4.1             
## [127] pillar_1.4.4             lattice_0.20-41          fastmap_1.0.1           
## [130] httr_1.4.1               survival_3.1-12          GO.db_3.8.2             
## [133] png_0.1-7                iterators_1.0.12         bit_1.1-15.2            
## [136] stringi_1.4.6            HDF5Array_1.12.3         blob_1.2.1              
## [139] latticeExtra_0.6-29      memoise_1.1.0